“How does land area within a state influence the income tax trends of the state?”
library(dplyr)
library(dcData)
library(mosaic)
library(tidyverse)
#install.packages(c('maps', 'mapdata', 'usdata')) ## For California example
library(usdata)
library(maps)
library(mapdata)
Tax2014 <- read.csv("https://www.irs.gov/pub/irs-soi/14zpallagi.csv") # Primary data
Tax2014
View(ZipGeography) # Secondary data
names(Tax2014)
[1] "STATEFIPS" "STATE" "zipcode" "agi_stub" "N1" "mars1" "MARS2" "MARS4" "PREP" "N2" "NUMDEP"
[12] "TOTAL_VITA" "VITA" "TCE" "A00100" "N02650" "A02650" "N00200" "A00200" "N00300" "A00300" "N00600"
[23] "A00600" "N00650" "A00650" "N00700" "A00700" "N00900" "A00900" "N01000" "A01000" "N01400" "A01400"
[34] "N01700" "A01700" "SCHF" "N02300" "A02300" "N02500" "A02500" "N26270" "A26270" "N02900" "A02900"
[45] "N03220" "A03220" "N03300" "A03300" "N03270" "A03270" "N03150" "A03150" "N03210" "A03210" "N03230"
[56] "A03230" "N03240" "A03240" "N04470" "A04470" "A00101" "N18425" "A18425" "N18450" "A18450" "N18500"
[67] "A18500" "N18300" "A18300" "N19300" "A19300" "N19700" "A19700" "N04800" "A04800" "N05800" "A05800"
[78] "N09600" "A09600" "N05780" "A05780" "N07100" "A07100" "N07300" "A07300" "N07180" "A07180" "N07230"
[89] "A07230" "N07240" "A07240" "N07220" "A07220" "N07260" "A07260" "N09400" "A09400" "N85770" "A85770"
[100] "N85775" "A85775" "N09750" "A09750" "N10600" "A10600" "N59660" "A59660" "N59720" "A59720" "N11070"
[111] "A11070" "N10960" "A10960" "N11560" "A11560" "N06500" "A06500" "N10300" "A10300" "N85530" "A85530"
[122] "N85300" "A85300" "N11901" "A11901" "N11902" "A11902"
head(Tax2014)
favstats(~A06500, data = Tax2014) # a06500 = income tax amount
str(Tax2014)
'data.frame': 166722 obs. of 127 variables:
$ STATEFIPS : int 1 1 1 1 1 1 1 1 1 1 ...
$ STATE : chr "AL" "AL" "AL" "AL" ...
$ zipcode : int 0 0 0 0 0 0 35004 35004 35004 35004 ...
$ agi_stub : int 1 2 3 4 5 6 1 2 3 4 ...
$ N1 : num 850050 491370 259540 164840 203650 ...
$ mars1 : num 481840 200750 75820 26730 18990 ...
$ MARS2 : num 115070 150290 142970 125410 177070 ...
$ MARS4 : num 240450 125560 34070 10390 5860 ...
$ PREP : num 479900 281350 156720 99750 122670 ...
$ N2 : num 1401930 1016010 589190 423300 565930 ...
$ NUMDEP : num 548630 375670 186770 133020 185150 ...
$ TOTAL_VITA: num 24840 10850 3170 1260 1260 ...
$ VITA : num 16660 7080 1680 700 900 ...
$ TCE : num 8180 3780 1490 560 360 0 0 0 0 0 ...
$ A00100 : num 11004990 17658446 15963943 14294375 27387096 ...
$ N02650 : num 850050 491370 259540 164840 203650 ...
$ A02650 : num 11187657 17836190 16117661 14422811 27664725 ...
$ N00200 : num 682860 425830 223910 143710 181410 ...
$ A00200 : num 8746419 14494884 12316371 10817987 20155298 ...
$ N00300 : num 95140 92610 82760 69880 113360 ...
$ A00300 : num 64688 69421 69005 62269 141176 ...
$ N00600 : num 43950 41040 39530 35700 69620 ...
$ A00600 : num 72642 96100 123290 126688 368076 ...
$ N00650 : num 38880 36250 35530 32500 64740 ...
$ A00650 : num 46689 64683 85619 90127 274484 ...
$ N00700 : num 13400 51450 62280 57000 107690 ...
$ A00700 : num 5825 27881 40993 43748 111331 ...
$ N00900 : num 145420 62500 39640 28060 38850 ...
$ A00900 : num 810441 250528 253529 232026 571411 ...
$ N01000 : num 35870 31950 31650 28920 59170 ...
$ A01000 : num 38739 75029 108577 139040 547842 ...
$ N01400 : num 37300 34210 28200 22560 32290 ...
$ A01400 : num 223749 317668 360311 378333 825820 ...
$ N01700 : num 107590 101020 72620 52890 69810 ...
$ A01700 : num 1047421 1791833 1757156 1584142 2690393 ...
$ SCHF : num 8170 8600 7680 6130 8720 2630 0 0 0 0 ...
$ N02300 : num 33740 18180 8900 5060 4820 ...
$ A02300 : num 97553 52682 27490 16508 17488 ...
$ N02500 : num 36220 88200 60570 40540 45910 ...
$ A02500 : num 64407 560917 882812 793454 1038902 ...
$ N26270 : num 8600 10430 10610 9960 24330 ...
$ A26270 : num 7880 52953 97042 116234 644799 ...
$ N02900 : num 151350 94650 68600 49710 71940 ...
$ A02900 : num 182667 177744 153718 128436 277629 ...
$ N03220 : num 2760 11500 8950 8540 12650 ...
$ A03220 : num 613 2756 2229 2312 3343 ...
$ N03300 : num 50 160 160 240 1250 2770 0 0 0 0 ...
$ A03300 : num 119 739 1192 1949 18046 ...
$ N03270 : num 8530 6810 4810 3530 7320 8520 40 0 30 0 ...
$ A03270 : num 30882 32249 27104 21584 56851 ...
$ N03150 : num 2440 6180 4850 3760 5530 1140 20 0 20 30 ...
$ A03150 : num 5935 20701 19580 17523 29992 ...
$ N03210 : num 19660 37140 27650 19190 22020 ...
$ A03210 : num 17306 39070 29802 21700 23255 ...
$ N03230 : num 5640 2050 2070 640 4460 0 20 0 30 0 ...
$ A03230 : num 16681 5249 4611 1216 9213 ...
$ N03240 : num 40 70 100 140 640 1660 0 0 0 0 ...
$ A03240 : num 4 45 115 206 2413 ...
$ N04470 : num 50060 106170 100170 82040 145150 ...
$ A04470 : num 697858 1595160 1713071 1587508 3395412 ...
$ A00101 : num 784033 4041356 6212309 7161929 20015351 ...
$ N18425 : num 21090 77320 81280 70090 132030 ...
$ A18425 : num 20029 129793 206019 242031 706154 ...
$ N18450 : num 22230 23420 15880 9930 11030 ...
$ A18450 : num 17571 28761 23310 17481 25051 ...
$ N18500 : num 25080 69770 80080 72620 135670 ...
$ A18500 : num 21675 56079 72769 76413 196303 ...
$ N18300 : num 48360 104850 99890 81940 145060 ...
$ A18300 : num 72527 240950 329474 361954 982577 ...
$ N19300 : num 23280 64890 73950 67010 123290 ...
$ A19300 : num 124980 349299 442924 458211 998836 ...
$ N19700 : num 37870 88100 85970 73330 134550 ...
$ A19700 : num 102592 331671 386048 390545 910154 ...
$ N04800 : num 328640 470700 258400 164610 203500 ...
$ A04800 : num 1830923 8618302 10205184 10045399 21014718 ...
$ N05800 : num 327060 468560 257860 164300 203230 ...
$ A05800 : num 194554 1043771 1404518 1452758 3631329 ...
$ N09600 : num 0 0 110 270 2270 ...
$ A09600 : num 0 0 76 322 5243 ...
$ N05780 : num 7290 6530 1740 410 210 0 0 40 0 0 ...
$ A05780 : num 2520 4477 2597 711 510 ...
$ N07100 : num 106200 215720 114100 76400 92860 ...
$ A07100 : num 36418 192756 168113 129131 126604 ...
$ N07300 : num 2470 5920 8040 7960 19520 ...
$ A07300 : num 66 265 515 693 5371 ...
$ N07180 : num 8490 28430 18010 15460 20550 ...
$ A07180 : num 2768 16078 9554 8450 11284 ...
$ N07230 : num 32980 45700 24340 17800 22740 ...
$ A07230 : num 16979 47932 31082 23168 31720 ...
$ N07240 : num 32000 67530 17950 0 0 ...
$ A07240 : num 5460 12516 2792 0 0 ...
$ N07220 : num 31350 122540 76950 54560 46520 ...
$ A07220 : num 9543 108415 116151 90111 63034 ...
$ N07260 : num 2240 10560 8970 6020 7940 ...
$ A07260 : num 604 4749 3975 2195 2660 ...
$ N09400 : num 116380 37790 26800 19600 28990 ...
$ A09400 : num 153768 80221 65138 54016 113649 ...
$ N85770 : num 24340 10620 1390 70 0 ...
$ A85770 : num 81465 33081 3678 88 0 ...
[list output truncated]
names(ZipGeography)
[1] "State" "Population" "HousingUnits" "LandArea" "WaterArea" "CityName" "PostOfficeName" "County"
[9] "AreaCode" "Timezone" "Latitude" "Longitude" "ZIP"
head(ZipGeography)
str(ZipGeography)
'data.frame': 42741 obs. of 13 variables:
$ State : Factor w/ 52 levels "","Massachusetts",..: 4 4 1 1 1 1 1 1 1 1 ...
$ Population : num 0 0 0 42042 55530 ...
$ HousingUnits : num 0 0 0 15590 21626 ...
$ LandArea : num 0.1 0 0 80.1 78.7 0 0 0 0 0 ...
$ WaterArea : num 46.3 170.3 4.7 0 0.1 ...
$ CityName : Factor w/ 18837 levels " ","Abington",..: 251 251 8 10 11 11 11 320 19 21 ...
$ PostOfficeName: Factor w/ 18928 levels "Abington","Accord",..: 249 249 7 9 10 10 10 318 18 20 ...
$ County : Factor w/ 1909 levels "Adjuntas","Aguada",..: 89 89 1 2 3 3 3 58 6 93 ...
$ AreaCode : num 631 631 787 787 787 787 787 787 787 787 ...
$ Timezone : Factor w/ 13 levels " ","EST","EST+1",..: 2 2 3 3 3 3 3 3 3 3 ...
$ Latitude : num 40.9 40.9 18.2 18.4 18.5 ...
$ Longitude : num -72.6 -72.6 -66.7 -67.2 -67.1 ...
$ ZIP : num 501 544 601 602 603 604 605 606 610 611 ...
ZipGeography <-
ZipGeography %>%
mutate(ZIP = as.numeric(ZIP))
narrow_table <-
Tax2014 %>%
pivot_longer(cols = c('mars1', 'MARS2', "MARS4"),
names_to = "typeofhouseholdreturns",
values_to = "numberofhouseholdreturns") # made the table narrower
narrow_table
JoinedTax2014 <-
narrow_table %>%
left_join(ZipGeography, by = c("zipcode" = "ZIP")) %>%
filter(zipcode != 0) # eliminate error in zipcode
JoinedTax2014 # joined table with corresponding zipcode
RevisedJoinedTax2014 <-
JoinedTax2014 %>%
group_by(STATE) %>%
summarise(STATE = STATE,
zipcode = zipcode,
agi_stub = agi_stub, # agi_stub = scale of income (1 to 6)
incometax = A06500,
averagestateincometax = mean(incometax),
minincometax = min(incometax),
maxincometax = max(incometax),
rangeincometax = maxincometax - minincometax,
Population = Population,
HousingUnits = HousingUnits,
CityName = CityName,
County = County,
AreaCode = AreaCode,
LandArea = LandArea,
WaterArea = WaterArea,
TypeofHouseholdReturns = typeofhouseholdreturns,
NumberofHouseholdReturns = numberofhouseholdreturns)
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
Please use `reframe()` instead.
When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.`summarise()` has grouped output by 'STATE'. You can override using the `.groups` argument.
RevisedJoinedTax2014 # Revised table with useful variables
regressMod <- lm(Population ~ incometax, data = RevisedJoinedTax2014)
regressMod$coefficients
(Intercept) incometax
9704.5524443 0.0727383
summary(regressMod)
Call:
lm(formula = Population ~ incometax, data = RevisedJoinedTax2014)
Residuals:
Min 1Q Median 3Q Max
-158052 -8465 -5978 4005 104282
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.705e+03 1.926e+01 503.9 <2e-16 ***
incometax 7.274e-02 4.642e-04 156.7 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 13220 on 488572 degrees of freedom
(10674 observations deleted due to missingness)
Multiple R-squared: 0.04785, Adjusted R-squared: 0.04785
F-statistic: 2.455e+04 on 1 and 488572 DF, p-value: < 2.2e-16
For our information, these are the average income taxes each state residents paid in 2014. Residents in California, New York, and Texas pays the highest income tax on average and residents in Vermont and Wyoming pays the lowest income tax on average. Our research question attempts to find the trend of these income taxes.
data <-
RevisedJoinedTax2014 %>%
summarise(State = STATE,
IncomeTax = mean(incometax),
agi_stub = agi_stub)
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
Please use `reframe()` instead.
When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.`summarise()` has grouped output by 'STATE'. You can override using the `.groups` argument.
data %>%
ggplot(aes(x = State, y = IncomeTax)) +
geom_bar(aes(x = reorder(State, -IncomeTax), color = State, fill = State), stat = "identity")+
labs(title = "Relationship between Income Tax and States")
We suspect that the population variable have some correlation to both land area and income tax, so the following three graphs attempt to draw that correlation in response to our research question.
This graph illustrates the correlation between land area of a state and its population. We can see a weak negative correlation between the two variables.
Graph1data <-
RevisedJoinedTax2014 %>%
summarise(LandArea = mean(LandArea, na.rm = TRUE),
Population = mean(Population, na.rm = TRUE))
Graph1data
Graph1data %>%
ggplot(aes(x = LandArea, y = Population)) +
geom_point(aes(color = STATE)) +
geom_smooth()+
labs(title = "Relationship between Population and Land Area in Different States")
This Graph shows that bigger the population, higher the tax rate - meaning that people who live in bigger cities tend to pay more tax.
Graph2data <-
RevisedJoinedTax2014 %>%
summarise(IncomeTax = mean(incometax, na.rm = TRUE),
Population = mean(Population, na.rm = TRUE))
Graph2data
Graph2data %>%
ggplot(aes(x = Population, y = IncomeTax)) +
geom_point(aes(color = STATE), stat = "identity") +
labs(title = "Relationship between Income Tax and Population Across Different States")
Our fourth graph tells us that bigger land areas are associated to lower income taxes.
RevisedJoinedTax2014 %>%
ggplot(aes(x = LandArea, y = incometax)) +
geom_point(aes(color = STATE)) +
facet_wrap(~ agi_stub, scales = "free_y",
labeller = label_both)+
labs(title = "Relationship between the Groups of Income Tax and Land Area across States ")
Let’s take a look at an example in California, a state with one of the highest income taxes. The graph below shows the relationship between land area and Average Income Tax in different counties of California.
california_counties <- map_data("county", region = "California")
california_counties <-
california_counties %>%
mutate(region = tools::toTitleCase(region)) %>%
mutate(subregion = tools::toTitleCase(subregion)) %>%
rename("State" = "region",
"County" = "subregion")
Test <-
JoinedTax2014 %>%
filter(STATE == "CA")
Test2 <- left_join(Test, california_counties, by = "County")
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
Test2 <- na.omit(Test2)
Yeah <- Test2 %>%
group_by(County) %>%
summarize(Average = mean(A06500, na.rm = TRUE))
Test3 <- left_join(Yeah, california_counties, by = "County")
Test3 <-
Test3 %>%
select(County, Average, long, lat, group, State)
land_area_data <- county_complete
Test5 <-
county_complete %>%
filter(state == "California") %>%
select(state, name, area_2010)
Test5$name <- gsub(" County", "", Test5$name)
Test5 <-
Test5 %>%
rename("State" = "state",
"County" = "name",
"Area" = "area_2010")
Test6 <- left_join(Test5, Test3, by = "County")
Test7 <-
Test6 %>%
distinct(County, .keep_all = TRUE)
Test7 <-
Test7 %>%
select(County, Area, Average, long, lat, group, State.y) %>%
rename("State" = "State.y")
ggplot(Test7, aes(x = Area, y = Average)) +
geom_point(aes(color = County)) +
labs(title = "Relationship between Average Income Tax and Land Area in California",
x = "Land Area (sq mi)",
y = "Average Income Tax")
If we were to map the density in a county map, our general conclusion would be proved in that counties with less land area tend to have higher income taxes.
ggplot(Test6, aes(x = long, y = lat, group = group, fill = Average)) +
geom_polygon(color = "white", size = 0.5) +
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Average Income Tax") +
labs(title = "Map of the Density of Average Income Tax in California")
NA
#Conlusion
From the graphs above, we can see the weak negative correlation between land area of a state and its population. This is likely due to overpopulation in major states like New York and the opposite in states like Wyoming. Our next graph shows a strong positive correlation between population and income tax of each states. This is due to bigger cities having better infrastructures and higher cost of living. Our last graph shows the correlation between land area and the income tax (our research question). We can see a negative correlation which means that bigger land area is associated with lower income tax.
With the information given above, we can conclude that bigger land area is correlated with lower income tax and vice versa. This is because the states with large areas are located in the middle of country and face geological constraints like the Rocky Mountain. Thus, states with large areas tend to have smaller population and poor infrastructure. This leads to lower cost of living and lower tax rate for those states which directly effect the average income tax. With this in mind, we successfully concluded that large land area of state correlates with lower income tax.